Tyƶ 4

Exercise 4

# access packages

library(MASS)
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## āœ” tibble  2.1.3     āœ” purrr   0.3.2
## āœ” tidyr   0.8.3     āœ” dplyr   0.8.3
## āœ” readr   1.3.1     āœ” stringr 1.4.0
## āœ” tibble  2.1.3     āœ” forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## āœ– dplyr::filter() masks stats::filter()
## āœ– dplyr::lag()    masks stats::lag()
## āœ– dplyr::select() masks MASS::select()
library(dplyr)
library(knitr)
library(corrplot)
## corrplot 0.84 loaded
library(tidyr)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# load the data
data("Boston")

Summary of the Boston Data

This data frame contains the following columns: crim = per capita crime rate by town. zn = proportion of residential land zoned for lots over 25,000 sq.ft. indus = proportion of non-retail business acres per town. chas = Charles River dummy variable (= 1 if tract bounds river; 0 otherwise). nox = nitrogen oxides concentration (parts per 10 million). rm = average number of rooms per dwelling. age = proportion of owner-occupied units built prior to 1940. dis = weighted mean of distances to five Boston employment centres. rad = index of accessibility to radial highways. tax = full-value property-tax rate per $10,000. ptratio = pupil-teacher ratio by town. black = 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town. lstat = lower status of the population (percent). medv = median value of owner-occupied homes in $1000s.

In this weeks exercise we use Boston data set from R MASS package which is a histoical data collected from 606 districts in the area around Boston.

Boston has 14 variables and 506 observations. Crime variable is the response variable.

Variables and their explanations are show above.

#Dataset summary and variables 

summary(Boston)
##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08204   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          black       
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      lstat            medv      
##  Min.   : 1.73   Min.   : 5.00  
##  1st Qu.: 6.95   1st Qu.:17.02  
##  Median :11.36   Median :21.20  
##  Mean   :12.65   Mean   :22.53  
##  3rd Qu.:16.95   3rd Qu.:25.00  
##  Max.   :37.97   Max.   :50.00
str(Boston)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
colnames(Boston)
##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
##  [8] "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"
pairs(Boston)

head(Boston)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12
##   lstat medv
## 1  4.98 24.0
## 2  9.14 21.6
## 3  4.03 34.7
## 4  2.94 33.4
## 5  5.33 36.2
## 6  5.21 28.7
#Graphical summary of crime variable

ggplot(Boston, aes(crim)) + stat_density() + theme_bw()

#Plotting each variable against crime rate

bosmelt <- melt(Boston, id="crim")
ggplot(bosmelt, aes(x=value, y=crim))+
  facet_wrap(~variable, scales="free")+
  geom_point()

Graphical overview and summaries of variables

boxplot(Boston$crim, Boston$zn, Boston$indus, Boston$chas, Boston$nox, Boston$rm, Boston$age, Boston$dis, Boston$rad, Boston$tax, Boston$ptratio, Boston$black, Boston$lstat, Boston$medv, names = c("crim", "zn", "indus", "chas", "nox", "rm", "age", "dis", "rad", "tax", "ptratio", "black", "lstat", "medv"))

Making multible linear Regression model with all variables

mlm <- lm(formula = crim ~ ., data = Boston)
summary(mlm)
## 
## Call:
## lm(formula = crim ~ ., data = Boston)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.924 -2.120 -0.353  1.019 75.051 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  17.033228   7.234903   2.354 0.018949 *  
## zn            0.044855   0.018734   2.394 0.017025 *  
## indus        -0.063855   0.083407  -0.766 0.444294    
## chas         -0.749134   1.180147  -0.635 0.525867    
## nox         -10.313535   5.275536  -1.955 0.051152 .  
## rm            0.430131   0.612830   0.702 0.483089    
## age           0.001452   0.017925   0.081 0.935488    
## dis          -0.987176   0.281817  -3.503 0.000502 ***
## rad           0.588209   0.088049   6.680 6.46e-11 ***
## tax          -0.003780   0.005156  -0.733 0.463793    
## ptratio      -0.271081   0.186450  -1.454 0.146611    
## black        -0.007538   0.003673  -2.052 0.040702 *  
## lstat         0.126211   0.075725   1.667 0.096208 .  
## medv         -0.198887   0.060516  -3.287 0.001087 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.439 on 492 degrees of freedom
## Multiple R-squared:  0.454,  Adjusted R-squared:  0.4396 
## F-statistic: 31.47 on 13 and 492 DF,  p-value: < 2.2e-16

Multiple linear regression model intepratation

Most significant variables in the model are dis and rad with high significance, median value with moderate significance and zone, black with lower but still under p 0.05 significance.

Including Correlations

cor_matrix<-cor(Boston) 
cor_matrix %>% round(digits = 2)
##          crim    zn indus  chas   nox    rm   age   dis   rad   tax
## crim     1.00 -0.20  0.41 -0.06  0.42 -0.22  0.35 -0.38  0.63  0.58
## zn      -0.20  1.00 -0.53 -0.04 -0.52  0.31 -0.57  0.66 -0.31 -0.31
## indus    0.41 -0.53  1.00  0.06  0.76 -0.39  0.64 -0.71  0.60  0.72
## chas    -0.06 -0.04  0.06  1.00  0.09  0.09  0.09 -0.10 -0.01 -0.04
## nox      0.42 -0.52  0.76  0.09  1.00 -0.30  0.73 -0.77  0.61  0.67
## rm      -0.22  0.31 -0.39  0.09 -0.30  1.00 -0.24  0.21 -0.21 -0.29
## age      0.35 -0.57  0.64  0.09  0.73 -0.24  1.00 -0.75  0.46  0.51
## dis     -0.38  0.66 -0.71 -0.10 -0.77  0.21 -0.75  1.00 -0.49 -0.53
## rad      0.63 -0.31  0.60 -0.01  0.61 -0.21  0.46 -0.49  1.00  0.91
## tax      0.58 -0.31  0.72 -0.04  0.67 -0.29  0.51 -0.53  0.91  1.00
## ptratio  0.29 -0.39  0.38 -0.12  0.19 -0.36  0.26 -0.23  0.46  0.46
## black   -0.39  0.18 -0.36  0.05 -0.38  0.13 -0.27  0.29 -0.44 -0.44
## lstat    0.46 -0.41  0.60 -0.05  0.59 -0.61  0.60 -0.50  0.49  0.54
## medv    -0.39  0.36 -0.48  0.18 -0.43  0.70 -0.38  0.25 -0.38 -0.47
##         ptratio black lstat  medv
## crim       0.29 -0.39  0.46 -0.39
## zn        -0.39  0.18 -0.41  0.36
## indus      0.38 -0.36  0.60 -0.48
## chas      -0.12  0.05 -0.05  0.18
## nox        0.19 -0.38  0.59 -0.43
## rm        -0.36  0.13 -0.61  0.70
## age        0.26 -0.27  0.60 -0.38
## dis       -0.23  0.29 -0.50  0.25
## rad        0.46 -0.44  0.49 -0.38
## tax        0.46 -0.44  0.54 -0.47
## ptratio    1.00 -0.18  0.37 -0.51
## black     -0.18  1.00 -0.37  0.33
## lstat      0.37 -0.37  1.00 -0.74
## medv      -0.51  0.33 -0.74  1.00
corrplot.mixed(cor_matrix, number.cex = .6)

Corrplot shows the relationships between variables. Highest positive correlation are between rad and tax, indus and nox and age and nox. Highest negative correlations are between age and dis, lstat and med and dis and nox. Wee can see from the summaries that distribution of the variables is very inconsistent and thus we need to scale the dataset before doing linear discriminant analysis later.

Standardizing the dataset

# center and standardize variables
boston_scaled <- scale(Boston)

# summaries of the scaled variables
summary(boston_scaled)
##       crim                 zn               indus        
##  Min.   :-0.419367   Min.   :-0.48724   Min.   :-1.5563  
##  1st Qu.:-0.410563   1st Qu.:-0.48724   1st Qu.:-0.8668  
##  Median :-0.390280   Median :-0.48724   Median :-0.2109  
##  Mean   : 0.000000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.007389   3rd Qu.: 0.04872   3rd Qu.: 1.0150  
##  Max.   : 9.924110   Max.   : 3.80047   Max.   : 2.4202  
##       chas              nox                rm               age         
##  Min.   :-0.2723   Min.   :-1.4644   Min.   :-3.8764   Min.   :-2.3331  
##  1st Qu.:-0.2723   1st Qu.:-0.9121   1st Qu.:-0.5681   1st Qu.:-0.8366  
##  Median :-0.2723   Median :-0.1441   Median :-0.1084   Median : 0.3171  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.:-0.2723   3rd Qu.: 0.5981   3rd Qu.: 0.4823   3rd Qu.: 0.9059  
##  Max.   : 3.6648   Max.   : 2.7296   Max.   : 3.5515   Max.   : 1.1164  
##       dis               rad               tax             ptratio       
##  Min.   :-1.2658   Min.   :-0.9819   Min.   :-1.3127   Min.   :-2.7047  
##  1st Qu.:-0.8049   1st Qu.:-0.6373   1st Qu.:-0.7668   1st Qu.:-0.4876  
##  Median :-0.2790   Median :-0.5225   Median :-0.4642   Median : 0.2746  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.6617   3rd Qu.: 1.6596   3rd Qu.: 1.5294   3rd Qu.: 0.8058  
##  Max.   : 3.9566   Max.   : 1.6596   Max.   : 1.7964   Max.   : 1.6372  
##      black             lstat              medv        
##  Min.   :-3.9033   Min.   :-1.5296   Min.   :-1.9063  
##  1st Qu.: 0.2049   1st Qu.:-0.7986   1st Qu.:-0.5989  
##  Median : 0.3808   Median :-0.1811   Median :-0.1449  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.4332   3rd Qu.: 0.6024   3rd Qu.: 0.2683  
##  Max.   : 0.4406   Max.   : 3.5453   Max.   : 2.9865
# change the object to data frame
boston_scaled <- as.data.frame(boston_scaled)

summary(boston_scaled)
##       crim                 zn               indus        
##  Min.   :-0.419367   Min.   :-0.48724   Min.   :-1.5563  
##  1st Qu.:-0.410563   1st Qu.:-0.48724   1st Qu.:-0.8668  
##  Median :-0.390280   Median :-0.48724   Median :-0.2109  
##  Mean   : 0.000000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.007389   3rd Qu.: 0.04872   3rd Qu.: 1.0150  
##  Max.   : 9.924110   Max.   : 3.80047   Max.   : 2.4202  
##       chas              nox                rm               age         
##  Min.   :-0.2723   Min.   :-1.4644   Min.   :-3.8764   Min.   :-2.3331  
##  1st Qu.:-0.2723   1st Qu.:-0.9121   1st Qu.:-0.5681   1st Qu.:-0.8366  
##  Median :-0.2723   Median :-0.1441   Median :-0.1084   Median : 0.3171  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.:-0.2723   3rd Qu.: 0.5981   3rd Qu.: 0.4823   3rd Qu.: 0.9059  
##  Max.   : 3.6648   Max.   : 2.7296   Max.   : 3.5515   Max.   : 1.1164  
##       dis               rad               tax             ptratio       
##  Min.   :-1.2658   Min.   :-0.9819   Min.   :-1.3127   Min.   :-2.7047  
##  1st Qu.:-0.8049   1st Qu.:-0.6373   1st Qu.:-0.7668   1st Qu.:-0.4876  
##  Median :-0.2790   Median :-0.5225   Median :-0.4642   Median : 0.2746  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.6617   3rd Qu.: 1.6596   3rd Qu.: 1.5294   3rd Qu.: 0.8058  
##  Max.   : 3.9566   Max.   : 1.6596   Max.   : 1.7964   Max.   : 1.6372  
##      black             lstat              medv        
##  Min.   :-3.9033   Min.   :-1.5296   Min.   :-1.9063  
##  1st Qu.: 0.2049   1st Qu.:-0.7986   1st Qu.:-0.5989  
##  Median : 0.3808   Median :-0.1811   Median :-0.1449  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.4332   3rd Qu.: 0.6024   3rd Qu.: 0.2683  
##  Max.   : 0.4406   Max.   : 3.5453   Max.   : 2.9865

Scaling effects

With standardizing data is centralized. This is done to continuous variables on unit scale by subtracting the mean of the variable and dividing the result by the variable’s standard deviation. With this variablesĀ“mean is 0 and SD is 1.

Creating categorical variable of the crime rate

# creating a quantile vector of crim 
bins <- quantile(boston_scaled$crim)

crime <- cut(boston_scaled$crim, breaks = bins, labels = c("low", "med_low", "med_high", "high"), include.lowest = TRUE)

table(crime)
## crime
##      low  med_low med_high     high 
##      127      126      126      127
#removing crim
boston_scaled <- dplyr::select(boston_scaled, -crim)

#adding categorical variable to the table
boston_scaled <- data.frame(boston_scaled, crime)

Creating training set and set test

For predicting with data we need a model training set which is in this case decided to be 80% of the cases and the rest of the data is used as a test set which shows the accuracy of the model.

n <- nrow(boston_scaled)

#Choosing 80% to the training set
ind <- sample(n,  size = n * 0.8)

train <- boston_scaled[ind,]

# creating the test set 
test <- boston_scaled[-ind,]

Fitting linear discriminant analysis on the training set using crime rate as the target variable

# linear discriminant analysis
lda.fit <- lda(crime ~ ., data = train)

# print the lda.fit object
lda.fit
## Call:
## lda(crime ~ ., data = train)
## 
## Prior probabilities of groups:
##       low   med_low  med_high      high 
## 0.2623762 0.2623762 0.2326733 0.2425743 
## 
## Group means:
##                   zn      indus         chas        nox         rm
## low       0.94246961 -0.8861398 -0.086616792 -0.8584749  0.4172776
## med_low  -0.09568294 -0.2876778  0.024810573 -0.5458486 -0.1678931
## med_high -0.38597716  0.1559420  0.272163518  0.2419705  0.1260541
## high     -0.48724019  1.0149946  0.008892378  1.0187453 -0.3961889
##                 age        dis        rad        tax    ptratio      black
## low      -0.8501431  0.8332105 -0.6860868 -0.7495207 -0.4553104  0.3815981
## med_low  -0.2786532  0.3488071 -0.5495709 -0.4862676 -0.0404671  0.3207203
## med_high  0.3851708 -0.3617894 -0.4222990 -0.3244624 -0.1769990  0.1285327
## high      0.8016635 -0.8453834  1.6596029  1.5294129  0.8057784 -0.8182474
##                 lstat        medv
## low      -0.765523184  0.51177156
## med_low  -0.066958894 -0.01895333
## med_high  0.005529983  0.22534407
## high      0.850570684 -0.61922222
## 
## Coefficients of linear discriminants:
##                 LD1           LD2         LD3
## zn       0.12579419  0.7287538441 -0.86515160
## indus    0.06317001 -0.1742584222  0.20645570
## chas    -0.01673556 -0.0295499014  0.13862143
## nox      0.36367273 -0.8167730559 -1.48730173
## rm       0.03566250 -0.1311097176 -0.22321969
## age      0.23050850 -0.2857497101 -0.07923017
## dis     -0.09239703 -0.3191581635  0.27087850
## rad      3.61697774  1.1618394035 -0.01698668
## tax      0.04504058 -0.2346893667  0.64749352
## ptratio  0.12098583 -0.0175853421 -0.18835435
## black   -0.13570533  0.0007128124  0.09729742
## lstat    0.17096901 -0.4323050441  0.38780085
## medv     0.06511955 -0.5105539268 -0.15486243
## 
## Proportion of trace:
##    LD1    LD2    LD3 
## 0.9617 0.0287 0.0096

LDA biplot

# the function for lda biplot arrows
lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "red", tex = 0.75, choices = c(1,2)){
  heads <- coef(x)
  arrows(x0 = 0, y0 = 0, 
         x1 = myscale * heads[,choices[1]], 
         y1 = myscale * heads[,choices[2]], col=color, length = arrow_heads)
  text(myscale * heads[,choices], labels = row.names(heads), 
       cex = tex, col=color, pos=3)
}

# target classes as numeric
classes <- as.numeric(train$crime)

# plot the lda results
l <- plot(lda.fit, dimen = 2, col = classes, pch = classes)
l + lda.arrows(lda.fit, myscale = 1)

## integer(0)

Prediction based on the lda model

# saving the correct classes from test data
correct_classes <- test$crime

# removing the crime variable from test data
test <- dplyr::select(test, -crime)

# predict classes with test data
lda.pred <- predict(lda.fit, newdata = test)

# cross tabulate the results
table(correct = correct_classes, predicted = lda.pred$class)
##           predicted
## correct    low med_low med_high high
##   low       17       3        1    0
##   med_low    6      13        1    0
##   med_high   0       8       22    2
##   high       0       0        1   28

From the cross table we can see that high values are predicted very nicely, but in the lower classes more errors occure.

Clustering, distances with euclidean distance

# Boston dataset reading and standardization again

data("Boston")
b_boston_scaled <- scale(Boston)

# Distances with euclidean distance
dist_eu <- dist(b_boston_scaled)
summary(dist_eu)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1343  3.4625  4.8241  4.9111  6.1863 14.3970

Clustering with K means with 3 cluster centers

km <- kmeans(b_boston_scaled, centers = 3)
set.seed(123)
k_max <- 10
twcss <- sapply(1:k_max, function(k){kmeans(b_boston_scaled, k)$tot.withinss})
qplot(x = 1:k_max, y = twcss, geom = 'line')

The optimal cluster size is the point where the line drops. In this it seems to be two.

# Clustering again
km2 <- kmeans(b_boston_scaled, centers = 2)
pairs(b_boston_scaled[,1:7], col = km2$cluster)

pairs(b_boston_scaled[,8:14], col = km2$cluster)

Bonus

km3 <- kmeans(Boston, centers = 3)
set.seed(123)
k_max <- 10
twcss <- sapply(1:k_max, function(k){kmeans(Boston, k)$tot.withinss})
qplot(x = 1:k_max, y = twcss, geom = 'line')

km4 <- kmeans(Boston, centers = 2)
pairs(Boston[,1:7], col = km4$cluster)

pairs(Boston[,8:14], col = km4$cluster)

bins <- quantile(Boston$crim)

crime2 <- cut(Boston$crim, breaks = bins, labels = c("low", "med_low", "med_high", "high"), include.lowest = TRUE)

table(crime2)
## crime2
##      low  med_low med_high     high 
##      127      126      126      127
Boston <- dplyr::select(Boston, -crim)

Boston <- data.frame(Boston, crime2)

u <- nrow(Boston)

ind2 <- sample(u,  size = u * 0.8)

train2 <- Boston[ind2,]

test2 <- Boston[-ind2,]

lda.fit2 <- lda(crime2 ~ ., data = train2)

lda.fit2
## Call:
## lda(crime2 ~ ., data = train2)
## 
## Prior probabilities of groups:
##       low   med_low  med_high      high 
## 0.2524752 0.2500000 0.2351485 0.2623762 
## 
## Group means:
##                 zn     indus       chas       nox       rm      age
## low      33.348039  5.173333 0.03921569 0.4534382 6.584784 43.77549
## med_low   8.094059  9.092871 0.04950495 0.4921772 6.197436 59.68119
## med_high  2.547368 12.335158 0.15789474 0.6040526 6.402337 81.93684
## high      0.000000 18.100000 0.05660377 0.6798208 6.005179 91.18208
##               dis       rad      tax  ptratio    black     lstat     medv
## low      5.657954  3.490196 282.8333 17.54412 390.9058  7.175784 27.00098
## med_low  4.467285  4.742574 321.8812 18.31683 385.1081 11.669109 22.51485
## med_high 2.911674  5.736842 349.4211 17.51789 364.4000 12.678211 24.52947
## high     2.027506 24.000000 666.0000 20.20000 278.7887 18.729623 15.95849
## 
## Coefficients of linear discriminants:
##                   LD1           LD2           LD3
## zn       0.0059594595  0.0291392963  -0.043654152
## indus    0.0116837992 -0.0194328871   0.028512310
## chas    -0.4398527204 -0.4347835204  -0.196120764
## nox      1.9916327959 -5.6218782002 -10.346653879
## rm      -0.1892416869 -0.1153374711  -0.261021540
## age      0.0080682351 -0.0142044048  -0.007142634
## dis     -0.0553948278 -0.0682297733   0.054422317
## rad      0.4673782227  0.1037951245  -0.004063668
## tax      0.0004496804 -0.0003602709   0.002638328
## ptratio  0.0678345162  0.0524262621  -0.082236100
## black   -0.0015035340  0.0003945387   0.001896470
## lstat    0.0294406326 -0.0448026121   0.058101833
## medv     0.0234646334 -0.0462138320  -0.009722294
## 
## Proportion of trace:
##    LD1    LD2    LD3 
## 0.9645 0.0275 0.0081
lda.arrows2 <- function(x, myscale = 1, arrow_heads = 0.1, color = "red", tex = 0.75, choices = c(1,2)){
  heads <- coef(x)
  arrows(x0 = 0, y0 = 0, 
         x1 = myscale * heads[,choices[1]], 
         y1 = myscale * heads[,choices[2]], col=color, length = arrow_heads)
  text(myscale * heads[,choices], labels = row.names(heads), 
       cex = tex, col=color, pos=3)
}
classes2 <- as.numeric(train2$crime2)

# plot the lda results
l <- plot(lda.fit2, dimen = 2, col = classes2, pch = classes2)
l + lda.arrows2(lda.fit2, myscale = 2)
## Warning in arrows(x0 = 0, y0 = 0, x1 = myscale * heads[, choices[1]], y1 =
## myscale * : zero-length arrow is of indeterminate angle and so skipped

## integer(0)

Nox and seems to be the most influencal linear separators in analysis without standardization.

Super Bonus

I don“t get the colors right. Otherwise nice

model_predictors <- dplyr::select(train, -crime)
# check the dimensions
dim(model_predictors)
## [1] 404  13
dim(lda.fit$scaling)
## [1] 13  3
# matrix multiplication
matrix_product <- as.matrix(model_predictors) %*% lda.fit$scaling
matrix_product <- as.data.frame(matrix_product)